{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from utils import *\n",
    "import tensorflow as tf\n",
    "from collections import Counter\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['negative', 'positive']\n",
      "10662\n",
      "10662\n"
     ]
    }
   ],
   "source": [
    "trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "trainset.data, trainset.target = separate_dataset(trainset,1.0)\n",
    "print(trainset.target_names)\n",
    "print(len(trainset.data))\n",
    "print(len(trainset.target))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total words: 197234\n"
     ]
    }
   ],
   "source": [
    "texts = ' '.join(trainset.data)\n",
    "words = texts.split()\n",
    "word2freq = Counter(words)\n",
    "print(\"Total words:\", len(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocabulary size: 20465\n"
     ]
    }
   ],
   "source": [
    "_words = set(words)\n",
    "word2idx = {c: i for i, c in enumerate(_words)}\n",
    "idx2word = {i: c for i, c in enumerate(_words)}\n",
    "vocab_size = len(idx2word)\n",
    "indexed = [word2idx[w] for w in words]\n",
    "print('Vocabulary size:', vocab_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CBOW:\n",
    "    def __init__(self, sample_size, vocab_size, embedded_size, window_size=3):\n",
    "        self.X = tf.placeholder(tf.int32, shape=[None, 2*window_size])\n",
    "        self.Y = tf.placeholder(tf.int32, shape=[None, 1])\n",
    "        self.embedding = tf.Variable(tf.truncated_normal([vocab_size, embedded_size],\n",
    "                                                      stddev=1.0 / np.sqrt(embedded_size)))\n",
    "        self.bias = tf.Variable(tf.zeros([vocab_size]))\n",
    "        embedded = tf.nn.embedding_lookup(self.embedding, self.X)\n",
    "        embedded = tf.reduce_mean(embedded, axis=1)\n",
    "        self.cost = tf.reduce_mean(tf.nn.sampled_softmax_loss(\n",
    "            weights=self.embedding,\n",
    "            biases=self.bias,\n",
    "            labels=self.Y,\n",
    "            inputs=embedded,\n",
    "            num_sampled=sample_size,\n",
    "            num_classes=vocab_size))\n",
    "        self.optimizer = tf.train.AdamOptimizer().minimize(self.cost)\n",
    "        self.valid_dataset = tf.placeholder(tf.int32, shape=[None])\n",
    "        norm = tf.sqrt(tf.reduce_sum(tf.square(self.embedding), 1, keep_dims=True))\n",
    "        normalized_embeddings = self.embedding / norm\n",
    "        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, self.valid_dataset)\n",
    "        self.similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 128\n",
    "embedded_size = 128\n",
    "window_size = 3\n",
    "epoch = 10\n",
    "valid_size = 10\n",
    "nearest_neighbors = 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = CBOW(batch_size,vocab_size,embedded_size)\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_x(words, idx):\n",
    "    left = idx - window_size\n",
    "    right = idx + window_size\n",
    "    return words[left: idx] + words[idx+1: right+1]\n",
    "\n",
    "def make_xy(int_words):\n",
    "    x,y = [], []\n",
    "    for i in range(window_size, len(int_words)-window_size):\n",
    "        inputs = get_x(int_words, i)\n",
    "        x.append(inputs)\n",
    "        y.append(int_words[i])\n",
    "    return np.array(x), np.array(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, Y = make_xy(indexed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 1, avg loss 2.384114\n",
      "Nearest to to: his, what, it, in, all, and, movie, its,\n",
      "Nearest to compelling: comedy, does, good, other, a, down, feels, just,\n",
      "Nearest to 1984: underdone, showtunes, lyrics, requirement, plans, arrogant, concentrating, shiver,\n",
      "Nearest to many: too, one, scenes, does, our, was, down, so,\n",
      "Nearest to it: what, an, but, that, doesnt, you, to, so,\n",
      "Nearest to debut: can, leave, here, dont, everything, our, other, hard,\n",
      "Nearest to who: does, and, then, for, it, what, down, his,\n",
      "Nearest to amicable: comebacks, eternal, imagines, isolated, overloads, stationary, plummets, danilo,\n",
      "Nearest to fashioned: old, down, long, watch, comes, passion, bland, around,\n",
      "Nearest to david: youve, virtually, lives, and, take, kind, performances, takes,\n",
      "epoch 2, avg loss 2.032223\n",
      "Nearest to consolation: become, five, half, spaces, ages, success, much, man,\n",
      "Nearest to puts: music, together, seven, after, leave, almost, our, again,\n",
      "Nearest to sensual: completely, short, holes, several, series, mr, assured, substance,\n",
      "Nearest to be: might, want, may, dont, can, should, could, you,\n",
      "Nearest to slightest: demonstration, nerve, freshfaced, closet, scenarios, joan, details, perversity,\n",
      "Nearest to latest: year, only, again, other, fascinating, best, way, quickly,\n",
      "Nearest to to: it, his, make, what, about, down, have, away,\n",
      "Nearest to moviestarring: rome, gambol, cultura, raptures, posttarantino, junkcalorie, kurupt, radio,\n",
      "Nearest to comedy: about, is, control, tedious, for, movie, again, audience,\n",
      "Nearest to be: might, want, may, dont, can, should, could, you,\n",
      "epoch 3, avg loss 1.966328\n",
      "Nearest to to: trying, make, it, have, put, enough, what, manages,\n",
      "Nearest to the: in, of, films, version, most, and, longest, from,\n",
      "Nearest to have: could, been, would, you, never, dont, make, want,\n",
      "Nearest to movie: this, bit, about, satire, becomes, that, for, film,\n",
      "Nearest to close: again, me, away, could, seems, never, tedious, enough,\n",
      "Nearest to effects: special, leave, assured, perfect, draws, many, long, or,\n",
      "Nearest to fiction: science, past, accomplished, blood, audience, mind, poor, praise,\n",
      "Nearest to moved: you, want, be, probably, reason, recommend, arent, i,\n",
      "Nearest to legged: eight, freaks, dogpaddle, inventiveness, breaks, slight, hannibal, motion,\n",
      "Nearest to own: its, his, time, or, yet, for, my, away,\n",
      "epoch 4, avg loss 1.880857\n",
      "Nearest to with: such, human, an, film, story, work, his, script,\n",
      "Nearest to the: in, of, films, yard, most, power, shadow, line,\n",
      "Nearest to the: in, of, films, yard, most, power, shadow, line,\n",
      "Nearest to but: interesting, still, because, also, was, too, just, it,\n",
      "Nearest to the: in, of, films, yard, most, power, shadow, line,\n",
      "Nearest to that: movie, even, is, film, with, it, an, only,\n",
      "Nearest to and: his, dialogue, with, david, between, both, dancing, human,\n",
      "Nearest to a: is, gem, certain, romance, refreshingly, single, pleasant, classic,\n",
      "Nearest to and: his, dialogue, with, david, between, both, dancing, human,\n",
      "Nearest to its: own, way, head, only, movie, time, yet, his,\n",
      "epoch 5, avg loss 1.846353\n",
      "Nearest to wishywashy: here, told, something, cause, quickly, serious, stupid, isnt,\n",
      "Nearest to the: in, process, films, same, power, yard, lies, of,\n",
      "Nearest to to: trying, manage, used, tends, willing, finish, fails, able,\n",
      "Nearest to devotion: fortune, and, wacky, diggs, risks, accomplishes, wife, it,\n",
      "Nearest to already: understand, think, long, nothing, cant, seen, leave, stupid,\n",
      "Nearest to baffling: john, riveting, story, makes, uneven, fascinating, motion, bythenumbers,\n",
      "Nearest to is: film, movie, this, comedy, that, journey, performance, fascinating,\n",
      "Nearest to a: muddle, sentence, disappointment, funeral, gem, tearjerker, refreshing, single,\n",
      "Nearest to his: ugly, human, face, lacking, aspects, celebrates, friendship, energy,\n",
      "Nearest to constantly: premise, motion, poor, put, ugly, because, told, themselves,\n",
      "epoch 6, avg loss 1.783981\n",
      "Nearest to impact: emotional, lack, aspects, jokes, humor, theater, awkward, animation,\n",
      "Nearest to bittersweet: earnest, rich, style, friendship, dialogue, overblown, devastating, dark,\n",
      "Nearest to the: process, fate, transporter, emptiness, same, lies, lady, in,\n",
      "Nearest to tearjerker: a, single, mess, bravura, lot, muddle, gritty, thriller,\n",
      "Nearest to 11: damage, times, control, once, irritating, several, ago, violence,\n",
      "Nearest to director: uses, takes, strikes, barely, thoughtful, unexpected, offers, ugly,\n",
      "Nearest to business: first, meaning, plots, historical, plenty, silliness, movies, year,\n",
      "Nearest to an: offers, ugly, exercise, heartfelt, motion, intriguing, earnest, artist,\n",
      "Nearest to youll: if, here, alone, long, were, stupid, horrible, us,\n",
      "Nearest to beautiful: makes, wise, bitter, talented, past, thoughtful, idea, stupid,\n",
      "epoch 7, avg loss 1.701049\n",
      "Nearest to the: process, fate, transporter, oddest, inherent, yard, lady, biggest,\n",
      "Nearest to make: he, manages, tries, hit, watch, would, hard, me,\n",
      "Nearest to vividly: industry, funniest, conveys, kicks, girls, biggest, rest, latest,\n",
      "Nearest to lovingly: paranoia, ridiculous, loneliness, pacing, compassion, pain, mores, emotions,\n",
      "Nearest to wilt: reside, funniness, instalment, ivan, hijacks, awfulness, pushing, insideshowbiz,\n",
      "Nearest to our: skin, many, values, edge, children, girls, draws, adults,\n",
      "Nearest to the: process, fate, transporter, oddest, inherent, yard, lady, biggest,\n",
      "Nearest to perhaps: demographic, 100, long, after, one, latest, though, short,\n",
      "Nearest to michael: berg, caine, honest, leaden, occasionally, lame, directed, performances,\n",
      "Nearest to and: both, crisp, compassion, dialogue, backstabbing, singing, obvious, detailed,\n",
      "epoch 8, avg loss 1.645215\n",
      "Nearest to is: film, essentially, alias, missing, movie, result, charmless, this,\n",
      "Nearest to swap: player, relatives, deuces, rank, intermittent, artist, instance, ambiguous,\n",
      "Nearest to fire: reign, lots, parts, kind, instead, issue, old, outbursts,\n",
      "Nearest to together: put, serious, themselves, them, does, hes, mind, queens,\n",
      "Nearest to schoolers: jugglers, innovators, liberally, petins, smashups, undergraduate, screechingmetal, anxieties,\n",
      "Nearest to reeses: peanut, ate, bellyaching, wafer, reminiscence, sobaditsfunny, plucks, semi,\n",
      "Nearest to the: transporter, fate, process, majority, united, notion, same, awfulness,\n",
      "Nearest to might: called, have, want, would, could, gotten, should, said,\n",
      "Nearest to at: least, look, times, aimed, once, core, spiritual, subject,\n",
      "Nearest to bears: country, intensely, hes, he, well, today, long, taken,\n",
      "epoch 9, avg loss 1.540123\n",
      "Nearest to is: alias, shiri, athletic, charmless, missing, film, essentially, result,\n",
      "Nearest to best: years, one, same, funniest, swinging, entire, latest, films,\n",
      "Nearest to is: alias, shiri, athletic, charmless, missing, film, essentially, result,\n",
      "Nearest to thriller: psychological, mess, fascinating, trifle, glossy, demonstrates, compelling, constructed,\n",
      "Nearest to of: sort, depiction, equivalent, secrets, reminiscent, kind, prospect, modern,\n",
      "Nearest to a: muddle, tearjerker, spectator, sentence, funeral, cipher, disappointment, viewpoint,\n",
      "Nearest to story: gives, tone, love, man, premise, rejected, witless, manner,\n",
      "Nearest to and: both, crisp, dancing, singing, compassion, billy, fabulous, detailed,\n",
      "Nearest to tone: flat, drawn, story, witless, heartfelt, barely, presents, laughs,\n",
      "Nearest to glimpse: into, toothless, nightmare, mundane, subculture, psychological, dreary, expecting,\n",
      "epoch 10, avg loss 1.485667\n",
      "Nearest to head: morning, form, over, right, mind, your, place, out,\n",
      "Nearest to as: flippant, athlete, introduction, reliably, rank, described, wellconceived, qualify,\n",
      "Nearest to what: makes, actually, thought, saves, me, was, done, does,\n",
      "Nearest to sign: excuse, bad, sour, lot, terrible, trapped, yarn, passable,\n",
      "Nearest to its: own, sleeve, predecessor, fusty, racy, 2001, despite, clumsiness,\n",
      "Nearest to with: filled, deals, incident, nuance, fantasy, packed, energy, doses,\n",
      "Nearest to any: making, getting, attractive, emerges, wrong, form, 100, sitting,\n",
      "Nearest to manipulating: reprehensible, ripening, deblois, barbaras, fabulously, teenspeak, abdul, vinegar,\n",
      "Nearest to study: character, psychological, complete, portrait, dreary, heartfelt, motion, fascinating,\n",
      "Nearest to us: comes, hes, keep, themselves, become, stand, overcome, proper,\n"
     ]
    }
   ],
   "source": [
    "for i in range(epoch):\n",
    "    total_cost = 0\n",
    "    for k in range(0,(X.shape[0] // batch_size) * batch_size,batch_size):\n",
    "        batch_x = X[k:k+batch_size]\n",
    "        batch_y = Y[k:k+batch_size,np.newaxis]\n",
    "        cost,_ = sess.run([model.cost,model.optimizer],feed_dict={model.X:batch_x,\n",
    "                                                                 model.Y:batch_y})\n",
    "        total_cost += cost\n",
    "    total_cost /= (X.shape[0] // batch_size)\n",
    "    print('epoch %d, avg loss %f'%(i+1,total_cost))\n",
    "    random_valid_size = np.random.choice(indexed, valid_size)\n",
    "    similarity = sess.run(model.similarity,feed_dict={model.valid_dataset:random_valid_size})\n",
    "    for no, i in enumerate(random_valid_size):\n",
    "        valid_word = idx2word[i]\n",
    "        nearest = (-similarity[no, :]).argsort()[1:nearest_neighbors + 1]\n",
    "        log_str = 'Nearest to %s:' % valid_word\n",
    "        for k in range(nearest_neighbors):\n",
    "            close_word = idx2word[nearest[k]]\n",
    "            log_str = '%s %s,' % (log_str, close_word)\n",
    "        print(log_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
